%load_ext autoreload
%autoreload 2
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
import pandas as pd
from scipy import signal
from spectrogramtools import *
from collections import OrderedDict
# Outer-product demo: a (3, 1) column times a (1, 10) row yields a (3, 10)
# matrix — each row of the result is the row vector scaled by one column entry.
A = np.array([1, 2, 5]).reshape(3, 1)
B = np.array([1, 0, 0, 2, 0, 0, 3, 0, 0, 1]).reshape(1, 10)
print(A.dot(B))
# Load the isolated voice and drum clips at a common sample rate so their
# spectrogram frames line up later on.
sample_rate = 22050
y_voice, sr = librosa.load("princevoice.wav", sr=sample_rate)
ipd.Audio(y_voice, rate=sr)
y_drum, sr = librosa.load("princedrum.wav", sr=sample_rate)
ipd.Audio(y_drum, rate=sr)
# STFT parameters: long window relative to the hop so a single frame holds a
# recognizable chunk of sound.
win_length = 2048*3
hop_length = 512
# Keep only the first STFT frame of each clip, as a (num_freqs, 1) column
# vector — this single spectral "snapshot" is the template we replay below.
SVoice = STFT(y_voice, win_length, hop_length, useLibrosa=False)[:, 0][:, np.newaxis]
SDrum = STFT(y_drum, win_length, hop_length, useLibrosa=False)[:, 0][:, np.newaxis]
# Show the two single-frame spectra side by side in dB.
for panel, spec, label in ((121, SDrum, "Drum"), (122, SVoice, "Voice")):
    plt.subplot(panel)
    librosa.display.specshow(librosa.amplitude_to_db(np.abs(spec), ref=np.max), x_axis='time')
    plt.title(label)
# Activation row: N impulses spaced T hops apart whose amplitudes decay as
# 1, 1/2, 1/3, ... — an echo-like repetition of the voice frame.
N = 40
T = 20
H = np.zeros((1, 1000))
H[0, np.arange(N) * T] = 1.0 / (np.arange(N) + 1)
# (freqs, 1) · (1, frames): every active column of H stamps a scaled copy of
# the voice spectrum into the output spectrogram.
S = SVoice.dot(H)
y = iSTFT(S, win_length, hop_length)
plt.subplot(211)
plt.plot(H[0, :])
plt.subplot(212)
plt.imshow(np.log(1 + np.abs(S)/1e-3), aspect='auto', cmap='magma_r')
plt.gca().invert_yaxis()
ipd.Audio(y, rate=sr)
# A steady pulse: one drum hit every 20 hops at full amplitude.
H = np.zeros((1, 500))
H[0, ::20] = 1
V = SDrum.dot(H)
y = iSTFT(V, win_length, hop_length)
ipd.Audio(y, rate=sr)
# Superimpose two pulse trains (every 20 hops and every 15 hops) in one
# activation row; both rhythms trigger the same voice spectrum.
H = np.zeros((1, 500))
for spacing in (20, 15):
    H[0, ::spacing] = 1
V = SVoice.dot(H)
y = iSTFT(V, win_length, hop_length)
ipd.Audio(y, rate=sr)
plt.plot(H[0, :])
We create a 4-on-3 rhythm — the voice plays the 3 (a hit every 20 hops) and the drum plays the 4 (a hit every 15 hops) — by doing two separate matrix multiplications and adding the results together.
# Separate score rows per instrument: H1 drives the voice (every 20 hops,
# the "3"), H2 drives the drum (every 15 hops, the "4"); sum the two
# rank-1 spectrograms to get the mix.
H1 = np.zeros((1, 500))
H1[0, ::20] = 1
H2 = np.zeros((1, 500))
H2[0, ::15] = 1
V = SVoice.dot(H1) + SDrum.dot(H2)
y = iSTFT(V, win_length, hop_length)
ipd.Audio(y, rate=sr)
But there is a more elegant way to do this. First, we create a matrix $A$ with two columns, where the first column is the voice spectrogram window and the second column is the drum spectrogram window.
# Stack the two template spectra side by side: column 0 is voice, column 1
# is drum.
SBoth = np.hstack((SVoice, SDrum))
librosa.display.specshow(librosa.amplitude_to_db(np.abs(SBoth), ref=np.max), x_axis='time')
Then we create a matrix with two rows, where the first row holds the activations of the voice and the second row holds the activations of the drum. Now we can form the mix with a single matrix multiplication of the $M \times 2$ and $2 \times N$ matrices. The matrix $H$ can be thought of as a little musical score, where each row says when its instrument is active over time.
# One two-row score, one matrix product: row k of H activates column k of
# SBoth, so the whole mix is a single (freqs, 2) @ (2, frames) multiply.
H = np.zeros((2, 500))
H[0, ::20] = 1  # row 0 triggers the voice column
H[1, ::15] = 1  # row 1 triggers the drum column
V = SBoth @ H
y = iSTFT(V, win_length, hop_length)
plt.figure(figsize=(12, 4))
plt.imshow(H, aspect='auto', interpolation='none')
ipd.Audio(y, rate=sr)